import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
df = pd.read_csv('nfl_qb.csv')
Looking at the first few rows of our data set to ensure there is nothing that immediately seems wrong.
df.head()
| year | player | tm | age | pos | g | gs | wins | loses | ties | ... | rush_rk | rush_att | rush_yds | rush_td | rush_lng | rush_y/a | rush_y/g | fmb | ap_1st | ap_2nd | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1970 | John Brodie | SFO | 35 | QB | 14 | 14 | 10 | 3 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0 | 1 | 0 |
| 1 | 1970 | Fran Tarkenton | NYG | 30 | QB | 14 | 14 | 9 | 5 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0 | 0 | 0 |
| 2 | 1970 | Jim Hart | STL | 26 | QB | 14 | 14 | 8 | 5 | 1 | ... | 127 | 18 | 18 | 0 | 4 | 1.0 | 1.3 | 6 | 0 | 0 |
| 3 | 1970 | Roman Gabriel | RAM | 30 | QB | 14 | 14 | 9 | 4 | 1 | ... | 106 | 28 | 104 | 1 | 15 | 3.7 | 7.4 | 6 | 0 | 0 |
| 4 | 1970 | Daryle Lamonica | OAK | 29 | QB | 14 | 14 | 8 | 4 | 2 | ... | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0 | 0 | 0 |
5 rows × 42 columns
df = df.loc[df['pass_att'] >= 75]
Checking if any cells are N/A.
df.isna().sum()
year 0 player 0 tm 0 age 0 pos 0 g 0 gs 0 wins 0 loses 0 ties 0 pass_rk 0 cmp 0 pass_att 0 cmp% 0 pass_yds 0 pass_td 0 td% 0 int 0 int% 0 pass_lng 0 pass_y/a 0 ay/a 0 y/c 0 pass_y/g 0 rate 0 sk 0 yards_lost_sack 0 sk% 0 ny/a 0 any/a 0 4qc 0 gwd 0 rush_rk 0 rush_att 0 rush_yds 0 rush_td 0 rush_lng 0 rush_y/a 0 rush_y/g 0 fmb 0 ap_1st 0 ap_2nd 0 dtype: int64
Making a new column 'was_ap' that will be one if the player received 1st or 2nd team AP honors that season. We are using a bitwise or operator to accomplish this.
df['was_ap'] = (df['ap_1st'] | df['ap_2nd'])
This formats all of the Seaborn plots in the manner that we want.
sns.set(rc={'figure.figsize':(20,7.55)})
sns.set_style('ticks')
This sorts the players who achieved AP honors to the top of the dataframe so they will be displayed on the top layer of graphs.
graph_df = df.sort_values(by = 'was_ap')
fig, axs = plt.subplots(nrows = 5)
fig.set_size_inches(15, 15)
fig.tight_layout(pad = -1.5)
axs[0].set_xticks([])
axs[1].set_xticks([])
sns.scatterplot(data = graph_df, x = 'year', y = 'g', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[0])
sns.scatterplot(data = graph_df, x = 'year', y = 'gs', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[1])
sns.scatterplot(data = graph_df, x = 'year', y = 'wins', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[2])
sns.scatterplot(data = graph_df, x = 'year', y = 'loses', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[3])
sns.scatterplot(data = graph_df, x = 'year', y = 'ties', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[4])
<AxesSubplot:xlabel='year', ylabel='ties'>
fig, axs = plt.subplots(nrows = 6)
fig.set_size_inches(15, 18)
fig.tight_layout(pad = -1.5)
axs[0].set_xticks([])
axs[1].set_xticks([])
sns.scatterplot(data = graph_df, x = 'year', y = 'pass_rk', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[0])
sns.scatterplot(data = graph_df, x = 'year', y = 'pass_att', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[1])
sns.scatterplot(data = graph_df, x = 'year', y = 'cmp', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[2])
sns.scatterplot(data = graph_df, x = 'year', y = 'pass_yds', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[3])
sns.scatterplot(data = graph_df, x = 'year', y = 'pass_td', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[4])
sns.scatterplot(data = graph_df, x = 'year', y = 'int', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[5])
<AxesSubplot:xlabel='year', ylabel='int'>
fig, axs = plt.subplots(nrows = 3)
fig.set_size_inches(15, 9)
fig.tight_layout(pad = -1.5)
axs[0].set_xticks([])
axs[1].set_xticks([])
sns.scatterplot(data = graph_df, x = 'year', y = 'cmp%', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[0])
sns.scatterplot(data = graph_df, x = 'year', y = 'td%', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[1])
sns.scatterplot(data = graph_df, x = 'year', y = 'int%', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[2])
<AxesSubplot:xlabel='year', ylabel='int%'>
fig, axs = plt.subplots(nrows = 4)
fig.set_size_inches(15, 12)
fig.tight_layout(pad = -1.5)
axs[0].set_xticks([])
axs[1].set_xticks([])
sns.scatterplot(data = graph_df, x = 'year', y = 'pass_lng', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[0])
sns.scatterplot(data = graph_df, x = 'year', y = 'pass_y/a', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[1])
sns.scatterplot(data = graph_df, x = 'year', y = 'ay/a', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[2])
sns.scatterplot(data = graph_df, x = 'year', y = 'rate', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[3])
<AxesSubplot:xlabel='year', ylabel='rate'>
fig, axs = plt.subplots(nrows = 3)
fig.set_size_inches(15, 9)
fig.tight_layout(pad = -1.5)
axs[0].set_xticks([])
axs[1].set_xticks([])
sns.scatterplot(data = graph_df, x = 'year', y = 'y/c', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[0])
sns.scatterplot(data = graph_df, x = 'year', y = 'ny/a', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[1])
sns.scatterplot(data = graph_df, x = 'year', y = 'any/a', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[2])
<AxesSubplot:xlabel='year', ylabel='any/a'>
fig, axs = plt.subplots(nrows = 3)
fig.set_size_inches(15, 9)
fig.tight_layout(pad = -1.5)
axs[0].set_xticks([])
axs[1].set_xticks([])
sns.scatterplot(data = graph_df, x = 'year', y = 'sk', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[0])
sns.scatterplot(data = graph_df, x = 'year', y = 'sk%', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[1])
sns.scatterplot(data = graph_df, x = 'year', y = 'yards_lost_sack', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[2])
<AxesSubplot:xlabel='year', ylabel='yards_lost_sack'>
fig, axs = plt.subplots(nrows = 2)
fig.set_size_inches(15, 6)
fig.tight_layout(pad = -1.5)
axs[0].set_xticks([])
axs[1].set_xticks([])
sns.scatterplot(data = graph_df, x = 'year', y = '4qc', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[0])
sns.scatterplot(data = graph_df, x = 'year', y = 'gwd', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[1])
<AxesSubplot:xlabel='year', ylabel='gwd'>
fig, axs = plt.subplots(nrows = 4)
fig.set_size_inches(15, 12)
fig.tight_layout(pad = -1.5)
axs[0].set_xticks([])
axs[1].set_xticks([])
sns.scatterplot(data = graph_df, x = 'year', y = 'rush_rk', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[0])
sns.scatterplot(data = graph_df, x = 'year', y = 'rush_att', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[1])
sns.scatterplot(data = graph_df, x = 'year', y = 'rush_yds', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[2])
sns.scatterplot(data = graph_df, x = 'year', y = 'rush_td', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[3])
<AxesSubplot:xlabel='year', ylabel='rush_td'>
df.drop(columns = {'pos', 'pass_rk', 'rush_rk', 'rush_att', 'rush_yds', 'rush_td', 'rush_lng', 'rush_y/a', 'rush_y/g', 'fmb'}, inplace = True)
def normalize_by_year(df):
def normalize(group):
for column in group.columns:
if ((str(group[column].values[0]).isnumeric() or isinstance(group[column].values[0], float))
and column not in ['year', 'age', 'ap_1st', 'ap_2nd', 'was_ap']):
group[column] = (group[column] - min(group[column])) / (max(group[column]) - min(group[column]))
return group
groups = df.copy().groupby("year")
return groups.apply(normalize)
df_normalized = normalize_by_year(df)
graph_df_n = df_normalized.sort_values(by = 'was_ap')
fig, axs = plt.subplots(nrows = 5)
fig.set_size_inches(15, 15)
fig.tight_layout(pad = -1.5)
axs[0].set_xticks([])
axs[1].set_xticks([])
sns.scatterplot(data = graph_df_n, x = 'year', y = 'g', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[0])
sns.scatterplot(data = graph_df_n, x = 'year', y = 'gs', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[1])
sns.scatterplot(data = graph_df_n, x = 'year', y = 'wins', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[2])
sns.scatterplot(data = graph_df_n, x = 'year', y = 'loses', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[3])
sns.scatterplot(data = graph_df_n, x = 'year', y = 'ties', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[4])
<AxesSubplot:xlabel='year', ylabel='ties'>
fig, axs = plt.subplots(nrows = 5)
fig.set_size_inches(15, 15)
fig.tight_layout(pad = -1.5)
axs[0].set_xticks([])
axs[1].set_xticks([])
sns.scatterplot(data = graph_df_n, x = 'year', y = 'pass_att', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[0])
sns.scatterplot(data = graph_df_n, x = 'year', y = 'cmp', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[1])
sns.scatterplot(data = graph_df_n, x = 'year', y = 'pass_yds', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[2])
sns.scatterplot(data = graph_df_n, x = 'year', y = 'pass_td', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[3])
sns.scatterplot(data = graph_df_n, x = 'year', y = 'int', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[4])
<AxesSubplot:xlabel='year', ylabel='int'>
fig, axs = plt.subplots(nrows = 3)
fig.set_size_inches(15, 9)
fig.tight_layout(pad = -1.5)
axs[0].set_xticks([])
axs[1].set_xticks([])
sns.scatterplot(data = graph_df_n, x = 'year', y = 'cmp%', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[0])
sns.scatterplot(data = graph_df_n, x = 'year', y = 'td%', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[1])
sns.scatterplot(data = graph_df_n, x = 'year', y = 'int%', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[2])
<AxesSubplot:xlabel='year', ylabel='int%'>
fig, axs = plt.subplots(nrows = 4)
fig.set_size_inches(15, 12)
fig.tight_layout(pad = -1.5)
axs[0].set_xticks([])
axs[1].set_xticks([])
sns.scatterplot(data = graph_df_n, x = 'year', y = 'pass_lng', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[0])
sns.scatterplot(data = graph_df_n, x = 'year', y = 'pass_y/a', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[1])
sns.scatterplot(data = graph_df_n, x = 'year', y = 'ay/a', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[2])
sns.scatterplot(data = graph_df_n, x = 'year', y = 'rate', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[3])
<AxesSubplot:xlabel='year', ylabel='rate'>
fig, axs = plt.subplots(nrows = 3)
fig.set_size_inches(15, 9)
fig.tight_layout(pad = -1.5)
axs[0].set_xticks([])
axs[1].set_xticks([])
sns.scatterplot(data = graph_df_n, x = 'year', y = 'sk', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[0])
sns.scatterplot(data = graph_df_n, x = 'year', y = 'sk%', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[1])
sns.scatterplot(data = graph_df_n, x = 'year', y = 'yards_lost_sack', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[2])
<AxesSubplot:xlabel='year', ylabel='yards_lost_sack'>
fig, axs = plt.subplots(nrows = 2)
fig.set_size_inches(15, 6)
fig.tight_layout(pad = -1.5)
axs[0].set_xticks([])
axs[1].set_xticks([])
sns.scatterplot(data = graph_df_n, x = 'year', y = '4qc', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[0])
sns.scatterplot(data = graph_df_n, x = 'year', y = 'gwd', hue = 'was_ap', size = 'was_ap', size_order = [1, 0], style = 'was_ap', style_order = [1, 0], legend = False, ax = axs[1])
<AxesSubplot:xlabel='year', ylabel='gwd'>
# df.drop(columns = {'pos', 'pass_rk', 'rush_rk', 'rush_att', 'rush_yds', 'rush_td', 'rush_lng', 'rush_y/a', 'rush_y/g', 'fmb'}, inplace = True)
df_normalized.drop(columns = {'year', 'tm', 'age', 'gs', 'wins', 'loses', 'ties', 'pass_lng', 'pass_att',
'int', 'sk', 'sk%', 'yards_lost_sack', '4qc', 'gwd', 'ap_1st', 'ap_2nd',
'player'},
inplace = True)
y = df_normalized['was_ap']
X = df_normalized.drop(columns = {'was_ap'})
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 12)
DTC = DecisionTreeClassifier(max_depth = 4)
DTC.fit(X_train, y_train)
DecisionTreeClassifier(max_depth=4)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier(max_depth=4)
print(f"Our model's score against the testing data: {DTC.score(X_test, y_test)*100:4.2f}%")
Our model's score against the testing data: 95.81%
ConfusionMatrixDisplay.from_estimator(DTC, X_test, y_test)
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f1de55406d0>
y_pred = DTC.predict(X_test)
print(f"Our model's accuracy for predicting All-Pro's: {recall_score(y_test, y_pred)*100:4.2f}%")
Our model's accuracy for predicting All-Pro's: 44.12%
plot_tree(DTC, feature_names = X.columns, class_names = ['Was NOT AP', 'Was AP'])
[Text(0.5267857142857143, 0.9, 'pass_td <= 0.965\ngini = 0.08\nsamples = 1614\nvalue = [1547, 67]\nclass = Was NOT AP'), Text(0.2857142857142857, 0.7, 'pass_td <= 0.759\ngini = 0.041\nsamples = 1563\nvalue = [1530, 33]\nclass = Was NOT AP'), Text(0.14285714285714285, 0.5, 'ny/a <= 0.858\ngini = 0.014\nsamples = 1437\nvalue = [1427, 10]\nclass = Was NOT AP'), Text(0.07142857142857142, 0.3, 'any/a <= 0.992\ngini = 0.004\nsamples = 1365\nvalue = [1362, 3]\nclass = Was NOT AP'), Text(0.03571428571428571, 0.1, 'gini = 0.003\nsamples = 1364\nvalue = [1362, 2]\nclass = Was NOT AP'), Text(0.10714285714285714, 0.1, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = Was AP'), Text(0.21428571428571427, 0.3, 'cmp <= 0.571\ngini = 0.176\nsamples = 72\nvalue = [65, 7]\nclass = Was NOT AP'), Text(0.17857142857142858, 0.1, 'gini = 0.039\nsamples = 50\nvalue = [49, 1]\nclass = Was NOT AP'), Text(0.25, 0.1, 'gini = 0.397\nsamples = 22\nvalue = [16, 6]\nclass = Was NOT AP'), Text(0.42857142857142855, 0.5, 'ny/a <= 0.934\ngini = 0.298\nsamples = 126\nvalue = [103, 23]\nclass = Was NOT AP'), Text(0.35714285714285715, 0.3, 'cmp% <= 0.937\ngini = 0.248\nsamples = 117\nvalue = [100, 17]\nclass = Was NOT AP'), Text(0.32142857142857145, 0.1, 'gini = 0.207\nsamples = 111\nvalue = [98, 13]\nclass = Was NOT AP'), Text(0.39285714285714285, 0.1, 'gini = 0.444\nsamples = 6\nvalue = [2, 4]\nclass = Was AP'), Text(0.5, 0.3, 'int% <= 0.313\ngini = 0.444\nsamples = 9\nvalue = [3, 6]\nclass = Was AP'), Text(0.4642857142857143, 0.1, 'gini = 0.245\nsamples = 7\nvalue = [1, 6]\nclass = Was AP'), Text(0.5357142857142857, 0.1, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = Was NOT AP'), Text(0.7678571428571429, 0.7, 'rate <= 0.914\ngini = 0.444\nsamples = 51\nvalue = [17, 34]\nclass = Was AP'), Text(0.6785714285714286, 0.5, 'int% <= 0.414\ngini = 0.48\nsamples = 25\nvalue = [15, 10]\nclass = Was NOT AP'), Text(0.6428571428571429, 0.3, 'pass_y/a <= 0.536\ngini = 0.494\nsamples = 18\nvalue = [8, 10]\nclass = Was AP'), Text(0.6071428571428571, 0.1, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = Was NOT AP'), Text(0.6785714285714286, 0.1, 'gini = 0.408\nsamples = 14\nvalue = [4, 10]\nclass = Was AP'), Text(0.7142857142857143, 0.3, 'gini = 0.0\nsamples = 7\nvalue = [7, 0]\nclass = Was NOT AP'), Text(0.8571428571428571, 0.5, 'pass_yds <= 0.78\ngini = 0.142\nsamples = 26\nvalue = [2, 24]\nclass = Was AP'), Text(0.7857142857142857, 0.3, 'int% <= 0.137\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = Was NOT AP'), Text(0.75, 0.1, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = Was AP'), Text(0.8214285714285714, 0.1, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Was NOT AP'), Text(0.9285714285714286, 0.3, 'pass_td <= 0.974\ngini = 0.08\nsamples = 24\nvalue = [1, 23]\nclass = Was AP'), Text(0.8928571428571429, 0.1, 'gini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = Was NOT AP'), Text(0.9642857142857143, 0.1, 'gini = 0.0\nsamples = 22\nvalue = [0, 22]\nclass = Was AP')]
plot_data = X_test.copy()
plot_data['was_ap'] = y_pred
sns.scatterplot(data = plot_data, x = "pass_td", y = "any/a", hue = "was_ap")
<AxesSubplot:xlabel='pass_td', ylabel='any/a'>
RFC = RandomForestClassifier()
grid_params = {
"n_estimators": range(1, 15, 3),
"max_depth": range(1, 6)
}
RFC_GSCV = GridSearchCV(RFC, grid_params)
RFC_GSCV.fit(X_train, y_train)
RFC_GSCV.best_params_
{'max_depth': 3, 'n_estimators': 7}
best_RFC = RFC_GSCV.best_estimator_
print(f"Our model's score against the training data: {best_RFC.score(X_train, y_train)*100:4.2f}%")
Our model's score against the training data: 97.65%
print(f"Our model's score against the testing data: {best_RFC.score(X_test, y_test)*100:4.2f}%")
Our model's score against the testing data: 96.10%
ConfusionMatrixDisplay.from_estimator(best_RFC, X_test, y_test)
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f1de544c670>
y_pred = best_RFC.predict(X_test)
print(f"Our model's accuracy for predicting All-Pro's: {recall_score(y_test, y_pred)*100:4.2f}%")
Our model's accuracy for predicting All-Pro's: 26.47%
DTC = DecisionTreeClassifier()
grid_params = {
"max_depth": range(1, 6)
}
DTC_GSCV = GridSearchCV(DTC, grid_params)
DTC_GSCV.fit(X_train, y_train)
DTC_GSCV.best_params_
{'max_depth': 1}
best_DTC = DTC_GSCV.best_estimator_
print(f"Our model's score against the training data: {best_DTC.score(X_train, y_train)*100:4.2f}%")
Our model's score against the training data: 96.90%
print(f"Our model's score against the testing data: {best_DTC.score(X_test, y_test)*100:4.2f}%")
Our model's score against the testing data: 94.94%
ConfusionMatrixDisplay.from_estimator(best_DTC, X_test, y_test)
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f1de536c100>
y_pred = best_DTC.predict(X_test)
print(f"Our model's accuracy for predicting All-Pro's: {recall_score(y_test, y_pred)*100:4.2f}%")
Our model's accuracy for predicting All-Pro's: 29.41%
my_estimators = [
('DTC', DecisionTreeClassifier(max_depth = 2, random_state = 12)),
('RFC', best_RFC),
('LR', LogisticRegression(max_iter = 1000))
]
SC = StackingClassifier(estimators = my_estimators, final_estimator = best_DTC)
SC.fit(X_train, y_train)
StackingClassifier(estimators=[('DTC',
DecisionTreeClassifier(max_depth=2,
random_state=12)),
('RFC',
RandomForestClassifier(max_depth=3,
n_estimators=7)),
('LR', LogisticRegression(max_iter=1000))],
final_estimator=DecisionTreeClassifier(max_depth=1))In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. StackingClassifier(estimators=[('DTC',
DecisionTreeClassifier(max_depth=2,
random_state=12)),
('RFC',
RandomForestClassifier(max_depth=3,
n_estimators=7)),
('LR', LogisticRegression(max_iter=1000))],
final_estimator=DecisionTreeClassifier(max_depth=1))DecisionTreeClassifier(max_depth=2, random_state=12)
RandomForestClassifier(max_depth=3, n_estimators=7)
LogisticRegression(max_iter=1000)
DecisionTreeClassifier(max_depth=1)
print(f"Our model's score against the testing data: {SC.score(X_test, y_test)*100:4.2f}%")
Our model's score against the testing data: 95.95%
ConfusionMatrixDisplay.from_estimator(SC, X_test, y_test)
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f1de53baa00>
y_pred = SC.predict(X_test)
print(f"Our model's accuracy for predicting All-Pro's: {recall_score(y_test, y_pred)*100:4.2f}%")
Our model's accuracy for predicting All-Pro's: 32.35%
df2022 = pd.read_csv('2022_nfl_qb.csv')
df2022 = df2022.loc[df2022['Pos'] == 'QB']
df2022.isna().sum()
df2022.loc[df2022['Y/C'].isna()]
| Rk | Player | Tm | Age | Pos | G | GS | QBrec | Cmp | Att | ... | Rate | QBR | Sk | Yds.1 | Sk% | NY/A | ANY/A | 4QC | GWD | Player-additional | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 82 | 83 | Chase Daniel | LAC | 36 | QB | 2 | 0 | NaN | 0 | 2 | ... | 39.6 | 27.4 | 1 | 1 | 33.3 | -0.33 | -0.33 | NaN | NaN | DaniCh00 |
| 84 | 85 | Chad Henne | KAN | 37 | QB | 2 | 0 | NaN | 0 | 2 | ... | 39.6 | 3.5 | 0 | 0 | 0.0 | 0.00 | 0.00 | NaN | NaN | HennCh01 |
2 rows × 32 columns
df2022.dropna(inplace = True)
df_normalized.columns
Index(['g', 'cmp', 'cmp%', 'pass_yds', 'pass_td', 'td%', 'int%', 'pass_y/a',
'ay/a', 'y/c', 'pass_y/g', 'rate', 'ny/a', 'any/a', 'was_ap'],
dtype='object')
df2022.drop(columns = {'Rk', 'Player', 'Tm', 'Age', 'Pos', 'GS', 'QBrec', 'Att', 'Int', '1D', 'Lng',
'QBR', 'Sk', 'Yds.1', 'Sk%', '4QC', 'GWD', 'Player-additional'}, inplace = True)
df2022.columns
Index(['G', 'Cmp', 'Cmp%', 'Yds', 'TD', 'TD%', 'Int%', 'Y/A', 'AY/A', 'Y/C',
'Y/G', 'Rate', 'NY/A', 'ANY/A'],
dtype='object')
df2022.columns = ['g', 'cmp', 'cmp%', 'pass_yds', 'pass_td', 'td%', 'int%', 'pass_y/a', 'ay/a', 'y/c',
'pass_y/g', 'rate', 'ny/a', 'any/a']
df2022['year'] = 2022
df2022_normalized = normalize_by_year(df2022)
df2022_normalized.drop(columns = {'year'}, inplace = True)
SC.predict(df2022_normalized)
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
best_DTC.predict(df2022_normalized)
array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
best_RFC.predict(df2022_normalized)
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
df2022_normalized
| g | cmp | cmp% | pass_yds | pass_td | td% | int% | pass_y/a | ay/a | y/c | pass_y/g | rate | ny/a | any/a | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.9 | 0.857664 | 0.590643 | 1.000000 | 1.000000 | 0.961538 | 0.297297 | 0.71875 | 0.772727 | 0.804878 | 1.000000 | 0.819797 | 0.767516 | 0.817582 |
| 1 | 0.9 | 0.843066 | 0.789474 | 0.879774 | 0.814815 | 0.826923 | 0.324324 | 0.62500 | 0.681818 | 0.536585 | 0.838085 | 0.789340 | 0.531847 | 0.613187 |
| 2 | 0.9 | 0.770073 | 0.497076 | 0.866490 | 0.814815 | 0.807692 | 0.513514 | 0.59375 | 0.568182 | 0.707317 | 0.819796 | 0.593909 | 0.601911 | 0.597802 |
| 3 | 0.9 | 0.974453 | 0.649123 | 0.844238 | 0.629630 | 0.480769 | 0.216216 | 0.21875 | 0.340909 | 0.170732 | 0.790210 | 0.500000 | 0.267516 | 0.400000 |
| 4 | 0.9 | 1.000000 | 0.619883 | 0.841913 | 0.481481 | 0.326923 | 0.000000 | 0.18750 | 0.340909 | 0.121951 | 0.786982 | 0.482234 | 0.248408 | 0.426374 |
| 5 | 0.9 | 0.773723 | 1.000000 | 0.787778 | 0.703704 | 0.807692 | 0.243243 | 0.71875 | 0.750000 | 0.487805 | 0.713825 | 0.916244 | 0.573248 | 0.665934 |
| 6 | 1.0 | 0.715328 | 0.374269 | 0.770508 | 0.629630 | 0.615385 | 0.459459 | 0.43750 | 0.431818 | 0.609756 | 0.583109 | 0.426396 | 0.449045 | 0.465934 |
| 7 | 0.9 | 0.693431 | 0.543860 | 0.738957 | 0.592593 | 0.634615 | 0.297297 | 0.53125 | 0.545455 | 0.585366 | 0.647660 | 0.586294 | 0.544586 | 0.589011 |
| 8 | 0.9 | 0.620438 | 0.730994 | 0.711724 | 0.629630 | 0.807692 | 0.054054 | 0.75000 | 0.863636 | 0.731707 | 0.611081 | 0.906091 | 0.646497 | 0.775824 |
| 9 | 0.9 | 0.791971 | 0.526316 | 0.709399 | 0.555556 | 0.500000 | 0.378378 | 0.21875 | 0.272727 | 0.243902 | 0.607854 | 0.393401 | 0.168790 | 0.276923 |
| 10 | 0.8 | 0.791971 | 0.713450 | 0.690136 | 0.370370 | 0.307692 | 0.648649 | 0.28125 | 0.181818 | 0.195122 | 0.699301 | 0.289340 | 0.156051 | 0.131868 |
| 11 | 1.0 | 0.726277 | 0.508772 | 0.686483 | 0.703704 | 0.730769 | 0.405405 | 0.31250 | 0.386364 | 0.341463 | 0.478214 | 0.502538 | 0.273885 | 0.386813 |
| 12 | 0.7 | 0.521898 | 0.730994 | 0.684822 | 0.666667 | 1.000000 | 0.270270 | 1.00000 | 1.000000 | 1.000000 | 0.831092 | 1.000000 | 1.000000 | 1.000000 |
| 13 | 0.9 | 0.729927 | 0.584795 | 0.676519 | 0.518519 | 0.519231 | 0.216216 | 0.31250 | 0.409091 | 0.317073 | 0.563744 | 0.512690 | 0.315287 | 0.432967 |
| 14 | 0.9 | 0.594891 | 0.497076 | 0.601461 | 0.333333 | 0.365385 | 0.270270 | 0.40625 | 0.409091 | 0.487805 | 0.462076 | 0.418782 | 0.353503 | 0.408791 |
| 15 | 0.8 | 0.518248 | 0.263158 | 0.584856 | 0.185185 | 0.153846 | 0.216216 | 0.40625 | 0.409091 | 0.682927 | 0.543841 | 0.276650 | 0.242038 | 0.312088 |
| 16 | 0.8 | 0.489051 | 0.678363 | 0.544670 | 0.481481 | 0.730769 | 0.189189 | 0.65625 | 0.727273 | 0.658537 | 0.484669 | 0.771574 | 0.649682 | 0.723077 |
| 17 | 0.9 | 0.569343 | 0.608187 | 0.520757 | 0.296296 | 0.346154 | 0.162162 | 0.31250 | 0.386364 | 0.292683 | 0.353416 | 0.472081 | 0.152866 | 0.307692 |
| 18 | 0.7 | 0.675182 | 0.625731 | 0.518765 | 0.407407 | 0.423077 | 0.324324 | 0.09375 | 0.181818 | 0.000000 | 0.562130 | 0.368020 | 0.031847 | 0.184615 |
| 19 | 0.7 | 0.463504 | 0.660819 | 0.483228 | 0.444444 | 0.692308 | 0.459459 | 0.53125 | 0.522727 | 0.536585 | 0.504572 | 0.598985 | 0.519108 | 0.527473 |
| 20 | 0.9 | 0.474453 | 0.391813 | 0.479907 | 0.518519 | 0.730769 | 0.405405 | 0.34375 | 0.409091 | 0.463415 | 0.298010 | 0.469543 | 0.292994 | 0.397802 |
| 21 | 1.0 | 0.405109 | 0.333333 | 0.472268 | 0.444444 | 0.692308 | 0.648649 | 0.50000 | 0.409091 | 0.731707 | 0.211404 | 0.395939 | 0.331210 | 0.338462 |
| 22 | 0.7 | 0.474453 | 0.368421 | 0.447360 | 0.296296 | 0.384615 | 0.756757 | 0.21875 | 0.113636 | 0.365854 | 0.446477 | 0.139594 | 0.085987 | 0.070330 |
| 25 | 0.6 | 0.405109 | 0.730994 | 0.387247 | 0.148148 | 0.230769 | 0.540541 | 0.46875 | 0.318182 | 0.390244 | 0.466380 | 0.365482 | 0.305732 | 0.261538 |
| 26 | 0.9 | 0.302920 | 0.356725 | 0.364995 | 0.370370 | 0.711538 | 0.918919 | 0.53125 | 0.340909 | 0.756098 | 0.143088 | 0.324873 | 0.168790 | 0.153846 |
| 27 | 0.6 | 0.430657 | 0.561404 | 0.332116 | 0.037037 | 0.000000 | 0.567568 | 0.09375 | 0.000000 | 0.073171 | 0.367402 | 0.063452 | 0.022293 | 0.000000 |
| 30 | 0.3 | 0.259124 | 0.380117 | 0.229824 | 0.259259 | 0.557692 | 0.540541 | 0.18750 | 0.204545 | 0.292683 | 0.628295 | 0.291878 | 0.050955 | 0.156044 |
| 31 | 0.4 | 0.211679 | 0.362573 | 0.214879 | 0.222222 | 0.538462 | 0.486486 | 0.31250 | 0.318182 | 0.463415 | 0.402905 | 0.347716 | 0.257962 | 0.314286 |
| 32 | 0.4 | 0.116788 | 0.000000 | 0.160080 | 0.037037 | 0.134615 | 0.540541 | 0.31250 | 0.181818 | 0.756098 | 0.275955 | 0.000000 | 0.159236 | 0.136264 |
| 33 | 0.5 | 0.076642 | 0.163743 | 0.084357 | 0.074074 | 0.326923 | 0.351351 | 0.21875 | 0.250000 | 0.512195 | 0.000000 | 0.200508 | 0.264331 | 0.318681 |
| 34 | 0.0 | 0.065693 | 0.181287 | 0.034540 | 0.074074 | 0.346154 | 0.351351 | 0.00000 | 0.090909 | 0.195122 | 0.908553 | 0.134518 | 0.000000 | 0.131868 |
| 35 | 0.0 | 0.000000 | 0.461988 | 0.020259 | 0.037037 | 0.403846 | 1.000000 | 0.53125 | 0.227273 | 0.658537 | 0.831630 | 0.175127 | 0.347134 | 0.167033 |
| 36 | 0.2 | 0.018248 | 0.309942 | 0.000000 | 0.000000 | 0.173077 | 0.270270 | 0.12500 | 0.181818 | 0.268293 | 0.150619 | 0.190355 | 0.095541 | 0.197802 |
best_DTC.predict(df2022_normalized[0:1])
array([1])